DATA IMPORTATION AND CLEANING
library(classdata)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tidyr' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plyr)
## Warning: package 'plyr' was built under R version 4.1.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(matrixStats)
## Warning: package 'matrixStats' was built under R version 4.1.3
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:plyr':
##
## count
## The following object is masked from 'package:dplyr':
##
## count
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Columns used from the data tables.
Columns = c('X', 'Canada', 'Saudi.Arabia', 'United.States', 'India', 'Russian.Federation', 'South.Africa', 'Turkey', 'Brazil', 'Mexico', 'France', 'Germany', 'Italy', 'United.Kingdom', 'China', 'Sweden', 'Japan', 'Korea..Rep.', 'Switzerland', 'Spain', 'Italy', 'Belgium')
#Countries in the G20 + some to get the total to 20. Some of the G20 countries had no data.
Countries = c('Canada', 'Saudi.Arabia', 'United.States', 'India', 'Russian.Federation', 'South.Africa', 'Turkey', 'Brazil', 'Mexico', 'France', 'Germany', 'Italy', 'United.Kingdom', 'China', 'Sweden', 'Japan', 'Korea..Rep.', 'Switzerland', 'Spain', 'Italy', 'Belgium')
CPI <- read.csv("CPI.csv") #Used in 2, 3
CPIG20 <- CPI %>% select(one_of(Columns))
CPIG20 <- CPIG20[-1, ] #remove null first row
CPIG20 <- CPIG20[-1, ] #remove 1990
CPIG20 <- head(CPIG20, - 1) #remove last row (empty on most tables)
rownames(CPIG20) <- NULL #Reset row values
FReserves <- read.csv("ForeignReserves.csv") #Used in 6
FReservesG20 <- FReserves %>% select(one_of(Columns))
FReservesG20 <- FReservesG20[-1, ] #remove null first row
FReservesG20 <- head(FReservesG20, - 1) #remove last row
rownames(FReservesG20) <- NULL #Reset row values
GDP <- read.csv("GDP.csv") #Used in 1
GDPG20 <- GDP %>% select(one_of(Columns))
GDPG20 <- GDPG20[-1, ] #remove null first row
GDPG20 <- GDPG20[-1, ] #remove 1990
GDPG20 <- head(GDPG20, - 1) #remove last row (empty on most tables)
rownames(GDPG20) <- NULL #Reset row values
StockMarket <- read.csv("StockMarket.csv") #Used in 4
StockMarketG20 <- StockMarket %>% select(one_of(Columns))
StockMarketG20 <- StockMarketG20[-1, ] #remove null first row
StockMarketG20 <- StockMarketG20[-1, ] #remove 1990
StockMarketG20 <- head(StockMarketG20, - 1) #remove last row (empty on most tables)
rownames(StockMarketG20) <- NULL #Reset row values
TotalReserves <- read.csv("TotalReserves.csv") #Used in 6
TotalReservesG20 <- TotalReserves %>% select(one_of(Columns))
TotalReservesG20 <- TotalReservesG20[-1, ] #remove null first row
TotalReservesG20 <- TotalReservesG20[-1, ] #remove 1990
TotalReservesG20 <- head(TotalReservesG20, - 1) #remove last row (empty on most tables)
rownames(TotalReservesG20) <- NULL #Reset row values
UnemploymentRate <- read.csv("UnemploymentRate.csv") #Used in 5
UnemploymentRateG20 <- UnemploymentRate %>% select(one_of(Columns))
UnemploymentRateG20 <- UnemploymentRateG20[-1, ] #remove null first row
UnemploymentRateG20 <- UnemploymentRateG20[-1, ] #remove 1990
UnemploymentRateG20 <- head(UnemploymentRateG20, - 1) #remove last row (empty on most tables)
rownames(UnemploymentRateG20) <- NULL #Reset row values
Expectations: We expect mostly western countries to have the highest GDP. This is because western countries prioritize freedom and capitalism. We expect this support to show up in the data, and we expect the plots to be mostly related.
Post Code: The 5 counties with the highest GDP in 2018 are: The United States, China, Japan, Germany, and UK in that order. We can see that the plots between the western countries are highly coordinated. This makes sense as these countries have both military and trade alliances. China is a notable outlier in this data. This country shows an exceptional growth rate starting in about 2005. China is an outlier in another way too, but this time joined by Japan. The US, Germany and the UK all suffered badly from 2008 and you can see extreme GDP loss and growth loss. Both Germany and the UK have yet to meaningfully break above it’s 2008 max. Overall our expectations were met.
summary(GDPG20)
## X Canada Saudi.Arabia United.States
## Min. :1991 Min. : 579059 Min. :328375 Min. : 6158100
## 1st Qu.:1998 1st Qu.: 649782 1st Qu.:451739 1st Qu.: 8941500
## Median :2004 Median :1101535 Median :649039 Median :12625200
## Mean :2004 Mean :1142943 Mean :591157 Mean :12630757
## 3rd Qu.:2011 3rd Qu.:1576904 3rd Qu.:724157 3rd Qu.:15706300
## Max. :2018 Max. :1846649 Max. :782193 Max. :20494200
## NA's :14
## India Russian.Federation South.Africa Turkey
## Min. : 290308 Min. : 115811 Min. :116612 Min. :187473
## 1st Qu.: 489428 1st Qu.: 340399 1st Qu.:138155 1st Qu.:251367
## Median :1141455 Median : 934606 Median :243800 Median :450405
## Mean :1228519 Mean :1023202 Mean :239696 Mean :515492
## 3rd Qu.:1840370 3rd Qu.:1613752 3rd Qu.:326310 3rd Qu.:790821
## Max. :2718578 Max. :2280211 Max. :417641 Max. :948785
## NA's :5 NA's :2
## Brazil Mexico France Germany
## Min. : 511777 Min. : 352035 Min. :1360939 Min. :1954674
## 1st Qu.: 669195 1st Qu.: 527009 1st Qu.:1495213 1st Qu.:2222652
## Median :1106836 Median : 829427 Median :2155390 Median :2836442
## Mean :1363320 Mean : 840095 Mean :2089589 Mean :2893531
## 3rd Qu.:1877122 3rd Qu.:1127903 3rd Qu.:2669880 3rd Qu.:3503618
## Max. :2617293 Max. :1313676 Max. :2926868 Max. :4006795
## NA's :3
## Italy United.Kingdom China Sweden
## Min. :1004008 Min. :1151618 Min. : 488522 Min. :210153
## 1st Qu.:1244337 1st Qu.:1609003 1st Qu.: 1060124 1st Qu.:266917
## Median :1815878 Median :2405983 Median : 2282510 Median :385760
## Mean :1667115 Mean :2153345 Mean : 4522453 Mean :390382
## 3rd Qu.:2087325 3rd Qu.:2690112 3rd Qu.: 8027645 3rd Qu.:513633
## Max. :2399198 Max. :3085993 Max. :13601165 Max. :580017
## NA's :1
## Japan Korea..Rep. Switzerland Spain
## Min. :3590619 Min. : 324177 Min. :261865 Min. : 452311
## 1st Qu.:4444592 1st Qu.: 552106 1st Qu.:292585 1st Qu.: 615478
## Median :4827492 Median : 831865 Median :401480 Median :1114377
## Mean :4793066 Mean : 871664 Mean :455377 Mean :1006271
## 3rd Qu.:4993991 3rd Qu.:1208254 3rd Qu.:668661 3rd Qu.:1366484
## Max. :6202415 Max. :1620151 Max. :710774 Max. :1640875
##
## Belgium
## Min. :219723
## 1st Qu.:259059
## Median :379502
## Mean :374437
## 3rd Qu.:488713
## Max. :532337
##
#Countries with the highest GDP in 2018 are United States, China, Japan, Germany, UK
#Plot
plot <- data.frame(data = GDPG20,
x = GDPG20$X,
y = c(GDPG20$United.States,
GDPG20$China,
GDPG20$Japan,
GDPG20$Germany,
GDPG20$United.Kingdom),
group = rep(c("United States", "China", "Japan", "Germany", "United Kingdom"), each = 28))
ggplot(plot, aes(x, y, col = group)) +
geom_line() +
labs(title = "Country GDP per Year", y = "GDP", x = "Year")
## Warning: Removed 1 row(s) containing missing values (geom_path).
Expectations: As with the last qeustion we expect mostly western countries to have healthy data. We think there may be some blow outs with inflation, but we expect ~200% to be the maximum.
Post Code: From the chart we can see there are three countries which have an extreme amount of total cumulative inflation. Brazil, Russia, and Turkey all have over 1000% of cumulative total inflation. This is extreme and is deleterious to the economy of these nations. However, we can also see the flip side to that unhealthy coin when we view Japan. After 28 years Japan only has a cumulative inflation value of 10.7. This is low and unhealthy. This means on average Japan only had .38% inflation per year. Mexico, South Africa, China, India, Korea, Spain, Sweden, France, and Switzerland all had somewhat healthy rates. While the rest of the nations had healthy cumulative inflation rates. Our expections were somewhat met. We had not anticipated over 1000% inflation for some countries. On top of this most western countries were healthy.
test = which(unlist(lapply(CPIG20, is.numeric)))
CPIG20[(nrow(CPIG20) + 1), test] <- colSums(CPIG20[, test], na.rm=TRUE)
#add a row that is cumulative inflation
#Create a table of just Country and Total Inflation
TotalCPI = CPIG20 %>%
rowid_to_column() %>%
filter(rowid == 29) %>%
gather(col, val, Canada:Belgium) %>%
select(-rowid, -X)
names(TotalCPI) <- c("Country", "Cum.Infaltion")
#Plot
MaxInflationPerCountryFigure <- ggplot(TotalCPI, aes(reorder(Country, -Cum.Infaltion), Cum.Infaltion)) +
scale_y_continuous(trans='log10') +
geom_col() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Cumulative Inflation (1991 - 2018) per Country", y = "Cumulative Inflation", x = "Country")
MaxInflationPerCountryFigure <- ggplotly(MaxInflationPerCountryFigure)
MaxInflationPerCountryFigure
Expectations: Based on the last question I think it is safe to assume Brazil, Russia, and Turkey will be the leaders for most years.
Post Code: We can see that in the first 3 years of the data there was a lot of inflation for Brazil and Russia. These values are so high they would almost be considered an outlier here. Instead, we can look at the year and history to see these countries had lots of internal issues. Especially in Russia as the USSR would have dissolved in 1991. From this we can conclude that stability of a country is necessary to have stable inflation. Our expectation were met everywhere expect 1 spot. We did not expect India to show up as a yearly inflation leader.
test = CPIG20 %>% replace(is.na(.), 0) #make all na values 0 in the table
YearlyMaxData <- data.frame(matrix(ncol = 3, nrow = 0))
#create new empty table
#for each year (1991 - 2018) find the country with the highest inflation. Then append that data to YearlyMaxData
for (x in 1:28) {
#Find data
InterYearMax = test %>%
rowid_to_column() %>%
filter(rowid == x) %>%
gather(col, val, Canada:Belgium) %>%
filter(val == max(val)) %>%
select(-rowid)
#Append data
YearlyMaxData = rbind(YearlyMaxData, InterYearMax)
}
#Plot
MaxInflationPerYearFigure <- ggplot(data=YearlyMaxData, aes(x=X, y=val, fill=col)) +
scale_y_continuous(trans='log10') +
geom_bar(stat="identity") +
labs(title = "Highest Country Inflation (1991 - 2018) per Year", y = "Inflation", x = "Year")
MaxInflationPerYearFigure <- ggplotly(MaxInflationPerYearFigure)
MaxInflationPerYearFigure
Expectations: We expect mostly western countries to have the highest stock market cap. And we expect to be able to see the dot com bubble and the 2008 housing bubble.
Post Code: There are 2 notables facts about the data. The first is that Japan’s stock market cap starts out so high compared to the rest. It starts about 5 times bigger than any other country. And on top of this Japan does not keep the lead. By 1998 the US and Japan are about equal. By 2001 the United States was ahead. The other interesting part is that you can clearly see the 2008 financial crisis happen. Each country screamed higher on the years leading up and the years after it went down. Other than that the information is not surprising. Our expectations were fully met here.
summary(StockMarketG20)
## X Canada Saudi.Arabia United.States
## Min. :1991 Min. : 25.24 Min. : 20.37 Min. : 27.98
## 1st Qu.:1998 1st Qu.: 40.87 1st Qu.: 34.16 1st Qu.: 79.55
## Median :2004 Median : 67.04 Median :100.43 Median :100.33
## Mean :2004 Mean : 70.95 Mean : 89.05 Mean :103.91
## 3rd Qu.:2011 3rd Qu.:106.79 3rd Qu.:125.11 3rd Qu.:124.32
## Max. :2018 Max. :118.62 Max. :204.09 Max. :239.19
## NA's :3
## India Russian.Federation South.Africa Turkey
## Min. : 17.10 Min. : 7.585 Min. : 26.18 Min. : 16.11
## 1st Qu.: 25.56 1st Qu.: 29.620 1st Qu.: 76.32 1st Qu.: 24.68
## Median : 39.49 Median : 54.059 Median :105.48 Median : 59.29
## Mean : 60.82 Mean : 63.005 Mean : 90.30 Mean : 56.34
## 3rd Qu.:101.03 3rd Qu.: 94.884 3rd Qu.:115.25 3rd Qu.: 80.24
## Max. :137.83 Max. :152.166 Max. :122.21 Max. :111.24
## NA's :4 NA's :11
## Brazil Mexico France Germany
## Min. :10.79 Min. : 13.82 Min. : 35.82 Min. : 39.66
## 1st Qu.:15.87 1st Qu.: 23.23 1st Qu.: 57.35 1st Qu.: 63.42
## Median :39.60 Median : 44.97 Median : 82.18 Median : 88.57
## Mean :42.90 Mean : 62.65 Mean : 80.54 Mean : 95.88
## 3rd Qu.:65.04 3rd Qu.:104.65 3rd Qu.:100.48 3rd Qu.:127.25
## Max. :99.28 Max. :131.81 Max. :140.24 Max. :173.16
## NA's :3 NA's :5
## Italy United.Kingdom China Sweden
## Min. : 58.50 Min. : 46.88 Min. : 31.62 Min. : 19.97
## 1st Qu.: 77.66 1st Qu.: 73.95 1st Qu.: 77.17 1st Qu.: 47.32
## Median : 84.82 Median :100.33 Median : 89.56 Median : 79.53
## Mean : 96.02 Mean : 94.20 Mean : 86.46 Mean : 84.43
## 3rd Qu.:115.26 3rd Qu.:114.87 3rd Qu.: 96.87 3rd Qu.:123.92
## Max. :170.17 Max. :148.01 Max. :126.97 Max. :156.86
## NA's :6 NA's :14
## Japan Korea..Rep. Switzerland Spain
## Min. : 68.89 Min. : 73.55 Min. : 18.05 Min. : 17.58
## 1st Qu.: 97.47 1st Qu.: 99.40 1st Qu.: 56.77 1st Qu.: 38.34
## Median :122.86 Median :117.00 Median : 79.33 Median : 56.28
## Mean :120.58 Mean :142.40 Mean : 84.09 Mean : 58.08
## 3rd Qu.:139.02 3rd Qu.:147.42 3rd Qu.:110.16 3rd Qu.: 76.08
## Max. :172.82 Max. :306.33 Max. :148.02 Max. :128.38
## NA's :5
## Belgium
## Min. : 38.80
## 1st Qu.: 62.89
## Median : 91.24
## Mean : 88.88
## 3rd Qu.:110.69
## Max. :168.11
##
#3 Countries with the smallest cap at the start are: India, Mexico, Spain
#3 Countries with the largest cap at the end are: US, Germany, Japan
#Plot
plot <- data.frame(data = StockMarketG20,
x = StockMarketG20$X,
y = c(StockMarketG20$India,
StockMarketG20$Mexico,
StockMarketG20$Spain,
StockMarketG20$United.States,
StockMarketG20$Germany,
StockMarketG20$Japan),
group = rep(c("India", "Mexico", "Spain", "United States", "Germany", "Japan"), each = 28))
ggplot(plot, aes(x, y, col = group)) +
geom_line() +
labs(title = "Country Stock Market Cap per Year", y = "Stock Market Cap", x = "Year")
## Warning: Removed 5 row(s) containing missing values (geom_path).
Expectations: We do not know what to expect with this data. Our only guess is that for some the rate may stay high.
Post Code: First we can see that South Africa has a really high unemployment rate and it seems to be consistently high. The variation for the data from south aftica is extremely low. On the other hand we can see countries with extremely high variation. Spain has a huge range in it’s data. The range from Q1 to Q3 of Spain is bigger than the total range of any other country. Finally we can see that a “healthy” range of unemployment seems to be around ~6%. We can come to this conclusion by looking at countries that are healthy and wealthy from the other questions. These countries include: Germany, UK, US. Going in we had almost no expectations and the one we slightly had did manifest with South Africa. Spain having a wide spread is notable.
#Plot
UnemploymentRateG20 %>% select(Countries) %>%
pivot_longer(., cols = Countries, names_to = "Var", values_to = "Val") %>%
ggplot(aes(x = Var, y = Val)) +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
geom_boxplot() +
labs(title = "Boxplot of unemployment % per country", y = "Unemployment %", x = "Country")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(Countries)` instead of `Countries` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Warning: Removed 120 rows containing non-finite values (stat_boxplot).
Expectations: We expect a straight line from origin to the top right. There are o other expectations, we believe this one will be cut and dry.
Post Code: This is the most interesting plot of all the plots. We expected this to be a linear trend line and there to be no variability. However, what we got was happily surprising. We can see that countries generally take three approaches to reserves. Either the country holds a lot Total Reserves and not a lot of Foreign Reserves or vice verse. The Final approach is to hold neither (bottom left corner). Notable points of data: UK blows out the competition in Foreign Reserves and China blows out the competition in Total Reserves. This makes sense as the UK would want to hold a lot of USD and China would hold a lot of yuan as they have a closed economic system and trade a lot with the US. Our expectations were blown away. This plot is fun to observe.
ZerodTotal = TotalReservesG20 %>% replace(is.na(.), 0) #Set all na values to 0
ZerodForeign = FReservesG20 %>% replace(is.na(.), 0) #Set all na values to 0
YearlyReserveData <- data.frame(matrix(ncol = 4, nrow = 0)) #Create a new table with 4 rows that is empty.
#for each year (1991 - 2018) get data from each country for Total Reserves in a place holder table. Then find add a column from foreign reserves data to place holder table. Then append the final place holder table data to YearlyReserveData.
for (x in 1:28) {
#Get data from each country for Total Reserves
InterYearTotal = ZerodTotal %>%
rowid_to_column() %>%
filter(rowid == x) %>%
gather(col, val, Canada:Belgium) %>%
select(-rowid)
names(InterYearTotal) <- c("Year", "Country", "TotalReserves") #Rename
#Get data from each country for Foreign Reserves
InterYearForiegn = ZerodForeign %>%
rowid_to_column() %>%
filter(rowid == x) %>%
gather(col, val, Canada:Belgium) %>%
select(-rowid, -X, -col)
InterYearTotal$ForiegnReserves = InterYearForiegn$val #Add column from Foreign reserves to total reserves data
YearlyReserveData = rbind(YearlyReserveData, InterYearTotal) #Append final place holder table data to YearlyReserveData
}
#plot
YearlyReserveData %>% ggplot(aes(x = TotalReserves, y = ForiegnReserves, color = Country)) +
geom_point() +
labs(title = "Scatter plot of Total vs Foreign Reserves per Country", y = "ForiegnReserves", x = "TotalReserves")